# Alex F. Gazmararian
# agazmararian@gmail.com

library(tidyverse)
library(here)
library(tidylog)
library(readxl)
library(xml2)
library(purrr)

# Load all project functions
source(here("R", "load_functions.R"))

# Data source configuration
# Remote XML sources. Each one is fetched into data/input/ only when no local
# copy is present, so the input directory doubles as the cache.
urls <- list(
    senate = "https://www.senate.gov/general/contact_information/senators_cfm.xml",
    house = "https://clerk.house.gov/xml/lists/memberdata.xml",
    house_118 = "https://web.archive.org/web/20241001091427/https://clerk.house.gov/xml/lists/memberdata.xml"
)

# Local locations of every input file read by this script
data_paths <- list(
    senate = here("data", "input", "speaker_parties", "senators_cfm.xml"),
    house = here("data", "input", "speaker_parties", "house_members.xml"),
    house_118 = here("data", "input", "speaker_parties", "memberdata_118.xml"),
    governors = here("data", "input", "govparty", "governors_2024.xlsx")
)

# Make sure the parent directory of each input file exists
walk(
    unique(dirname(unlist(data_paths))),
    ~ dir.create(.x, recursive = TRUE, showWarnings = FALSE)
)

# Note: the helper functions download_xml_safely, validate_state,
# validate_district, convert_state_name, and load_member_data come from the
# R/ modules sourced above; download_if_missing and extract_last_name are
# defined locally in this script.

# Ensure a local copy of a remote file exists, downloading only when needed.
#
# Args:
#   url:       Remote location passed to download_xml_safely() when no usable
#              local copy exists.
#   dest_file: Local path where the file is (or will be) stored.
#
# Returns:
#   TRUE when a non-empty file is present at `dest_file`; FALSE (with a
#   warning) when the download did not produce one.
download_if_missing <- function(url, dest_file) {
    # A zero-byte file can only be the residue of an interrupted download, so
    # it is treated as missing instead of being cached forever.
    has_content <- function(path) {
        file.exists(path) && isTRUE(file.size(path) > 0)
    }

    if (has_content(dest_file)) {
        message(sprintf("Local file already exists: %s", dest_file))
        return(TRUE)
    }

    # Ensure directory exists before downloading
    dest_dir <- dirname(dest_file)
    if (!dir.exists(dest_dir)) {
        dir.create(dest_dir, recursive = TRUE, showWarnings = FALSE)
        message(sprintf("Created directory: %s", dest_dir))
    }

    message(sprintf("Local file not found, downloading from: %s", url))
    download_result <- download_xml_safely(url, dest_file)

    # isTRUE(as.logical(...)) tolerates NULL, NA, or zero-length results from
    # the downloader, any of which would make a bare `&&` raise an error.
    if (isTRUE(as.logical(download_result)) && has_content(dest_file)) {
        message(sprintf("Successfully saved file to: %s", dest_file))
        return(TRUE)
    }

    warning(sprintf("Failed to download and save file: %s", dest_file))
    FALSE
}

# Extract lower-cased last names from politician names.
#
# Two formats are recognised: "Last, First" (text before the first comma) and
# "First ... Last" (final space-separated token). Written with base R only so
# the helper has no package dependencies.
#
# Args:
#   name: Character vector of names; may contain NA or blank entries.
#
# Returns:
#   Character vector the same length as `name`. Missing, empty, and
#   whitespace-only inputs, as well as names with nothing before the comma,
#   yield NA rather than an empty string.
extract_last_name <- function(name) {
    last_names <- vapply(name, function(n) {
        if (is.na(n)) return(NA_character_)

        # Trim before testing for emptiness so "   " is treated like ""
        n <- tolower(trimws(n, whitespace = "[\\h\\v]"))

        if (grepl(",", n, fixed = TRUE)) {
            surname <- strsplit(n, ",", fixed = TRUE)[[1]][1]
            surname <- trimws(surname, whitespace = "[\\h\\v]")
        } else {
            tokens <- strsplit(n, " ", fixed = TRUE)[[1]]
            surname <- if (length(tokens) > 0) tokens[length(tokens)] else ""
        }

        if (nzchar(surname)) surname else NA_character_
    }, character(1), USE.NAMES = FALSE)

    # Carry over the input's names (if any) without inventing new ones
    names(last_names) <- names(name)
    last_names
}

# Governors
# Read the governors spreadsheet; a read failure stops the script with context.
tryCatch({
    govparty <- read_xlsx(data_paths$governors, progress = FALSE)
}, error = function(e) {
    stop(sprintf("Failed to load governors Excel file: %s", e$message))
})

# Keep only rows whose state abbreviation passes validate_state()
govparty <- govparty %>%
    mutate(
        gov_last = extract_last_name(governor),
        state_valid = map_lgl(state_abb, validate_state)
    ) %>%
    filter(state_valid) %>%
    select(state_abb, gov_last, party)

# Governors appended by hand (presumably not covered by the spreadsheet -
# verify against governors_2024.xlsx when updating this list).
additional_governors <- tribble(
    ~state_abb, ~gov_last,    ~party,
    "NC",       "cooper",     "D",  # Roy Cooper
    "IN",       "holcomb",    "R",  # Eric Holcomb
    # Jim Justice was elected as a Democrat but has been a Republican since
    # August 2017, so he is coded "R" here.
    "WV",       "justice",    "R",  # James Justice
    "NH",       "sununu",     "R",  # Christopher Sununu
    "SD",       "noem",       "R",  # Kristi Noem
    "LA",       "edwards",    "D",  # John Bel Edwards
    "MA",       "baker",      "R",  # Charles Baker
    "WA",       "inslee",     "D",  # Jay Inslee
    "AZ",       "ducey",      "R",  # Douglas Ducey
    "ND",       "burgum",     "R",  # Douglas Burgum
    "MO",       "parson",     "R",  # Michael Parson
    "AR",       "hutchinson", "R"   # William Hutchinson
)

govparty <- govparty %>%
    bind_rows(additional_governors) %>%
    rename(gov_party = party)

# Save governors data immediately (before processing Senate/House which may fail on network issues)
# The output directory is created first so the write cannot fail on a fresh checkout.
dir.create(here("data", "inter"), recursive = TRUE, showWarnings = FALSE)
write_csv(govparty, here("data", "inter", "governors_processed.csv"))
message("[OK] Saved governors data to data/inter/governors_processed.csv")

# U.S. Senators
# Fetch (if needed) and parse the Senate XML into one row per <member> node.
tryCatch({
    download_if_missing(urls$senate, data_paths$senate)
    # Fail with a clear message instead of an opaque XML read error
    if (!file.exists(data_paths$senate)) {
        stop("Senate XML is not available locally and could not be downloaded")
    }
    senators_xml <- read_xml(data_paths$senate)
    senator_nodes <- xml_find_all(senators_xml, ".//member")
    senators_df <- map_dfr(senator_nodes, ~load_member_data(., "senate"))
}, error = function(e) {
    stop(sprintf("Failed to process Senate data: %s", e$message))
})

# Normalise last names and keep only rows with a valid state
senators_df <- senators_df %>%
    mutate(
        last_name = str_to_lower(str_trim(last_name)),
        state_valid = map_lgl(state, validate_state)
    ) %>%
    filter(state_valid) %>%
    select(last_name, state, party) %>%
    rename(sen_party = party)

# Senators added by hand (presumably absent from the downloaded XML - verify)
additional_senators <- tribble(
    ~last_name,    ~state, ~sen_party,
    "burr",        "NC",   "R",
    "portman",     "OH",   "R",
    "inhofe",      "OK",   "R",
    "cardin",      "MD",   "D",
    "casey",       "PA",   "D",
    "rubio",       "FL",   "R",
    "brown",       "OH",   "D",
    "sinema",      "AZ",   "D",
    "toomey",      "PA",   "R",
    "blunt",       "MO",   "R",
    "stabenow",    "MI",   "D",
    "shelby",      "AL",   "R",
    "braun",       "IN",   "R",
    "manchin",     "WV",   "D",
    "feinstein",   "CA",   "D",
    "butler",      "CA",   "D",
    "vance",       "OH",   "R",
    "romney",      "UT",   "R"
)

# distinct() mirrors the House processing: a senator present in both the XML
# and the manual list must not appear twice, or later joins would fan out.
# Rows that differ in party are intentionally kept and need manual review.
senators_df <- bind_rows(senators_df, additional_senators) %>%
    distinct()

# House of Representatives
# Load current House members and, when obtainable, the archived 118th Congress
# roster; the archive is optional and the script continues without it.
tryCatch({
    download_if_missing(urls$house, data_paths$house)
    # Fail with a clear message instead of an opaque XML read error
    if (!file.exists(data_paths$house)) {
        stop("House XML is not available locally and could not be downloaded")
    }

    # Try to download 118th Congress archive with retry logic (web.archive.org can be slow)
    max_attempts <- 3
    house_118_available <- file.exists(data_paths$house_118)
    for (attempt in seq_len(max_attempts)) {
        if (house_118_available) break

        message(sprintf(
            "Attempting to download 118th Congress data (attempt %d/%d)...",
            attempt, max_attempts
        ))

        # Raise (never lower) the timeout for web.archive.org; `finally`
        # restores the previous value even when the download throws.
        old_timeout <- getOption("timeout")
        options(timeout = max(120, old_timeout))
        tryCatch(
            download_if_missing(urls$house_118, data_paths$house_118),
            error = function(e) {
                message(sprintf("Attempt %d failed: %s", attempt, e$message))
            },
            finally = options(timeout = old_timeout)
        )

        house_118_available <- file.exists(data_paths$house_118)

        # Back off after any failed attempt, whether it raised an error or
        # merely returned without producing the file.
        if (!house_118_available && attempt < max_attempts) {
            Sys.sleep(5)
        }
    }

    house_xml <- read_xml(data_paths$house)
    member_nodes <- xml_find_all(house_xml, ".//member")

    if (house_118_available) {
        house_xml118 <- read_xml(data_paths$house_118)
        member_nodes118 <- xml_find_all(house_xml118, ".//member")
        house_df <- bind_rows(
            map_dfr(member_nodes, ~load_member_data(., "house")),
            map_dfr(member_nodes118, ~load_member_data(., "house"))
        )
        message("[OK] Loaded House data from both current and 118th Congress archives")
    } else {
        house_df <- map_dfr(member_nodes, ~load_member_data(., "house"))
        message("[WARN] 118th Congress archive unavailable - using current House data only")
        message("       Some historical representatives may be missing party data")
    }
}, error = function(e) {
    stop(sprintf("Failed to process House data: %s", e$message))
})

# District labels that carry no district number; they are mapped to NA
non_numeric_districts <- c("Delegate", "Resident Commissioner")

# Standardise states, names, and districts, then keep only rows whose state
# converts successfully and whose district passes validate_district().
house_df <- house_df %>%
    mutate(state = convert_state_name(state)) %>%
    filter(!is.na(state)) %>%
    mutate(
        last_name = str_to_lower(str_trim(last_name)),
        district_clean = case_when(
            district %in% non_numeric_districts ~ NA_character_,
            district == "At Large" ~ "1",
            # Strip ordinal suffixes such as "1st" / "22nd"
            TRUE ~ str_replace_all(district, "(st|nd|rd|th)", "")
        ),
        district_clean = as.numeric(district_clean),
        district_valid = map_lgl(district_clean, validate_district)
    ) %>%
    filter(district_valid) %>%
    select(
        last_name,
        state,
        district = district_clean,
        rep_party = party
    )

# Representatives added by hand, in the same column layout as house_df
additional_reps <- tribble(
    ~last_name,      ~state, ~district, ~rep_party,
    "brooks",        "AL",   5,         "R",
    "gallego",       "AZ",   3,         "D",
    "gallego",       "AZ",   7,         "D",
    "schweikert",    "AZ",   6,         "R",
    "stanton",       "AZ",   9,         "D",
    "garcia",        "CA",   25,        "D",
    "matsui",        "CA",   6,         "D",
    "speier",        "CA",   14,        "D",
    "hinson",        "IA",   1,         "R",
    "huizenga",      "MI",   2,         "R",
    "levin",         "MI",   9,         "D",
    "mcclain",       "MI",   10,        "R",
    "moolenaar",     "MI",   4,         "R",
    "upton",         "MI",   6,         "R",
    "thompson",      "MS",   2,         "D",
    "bishop",        "NC",   9,         "R",
    "herrell",       "NM",   2,         "R",
    "sempolinski",   "NY",   23,        "R",
    "mullin",        "OK",   2,         "R",
    "lamb",          "PA",   17,        "D",
    "rice",          "SC",   7,         "R",
    "brady",         "TX",   8,         "R",
    "doggett",       "TX",   35,        "D",
    "lee",           "TX",   18,        "D",
    "taylor",        "TX",   3,         "R",
    "stewart",       "UT",   2,         "R",
    "walberg",       "MI",   7,         "R",
    "cooper",        "TN",   5,         "D",
    "mckinley",      "WV",   1,         "R"
)

# Apply the same state/district validation to the manual rows
additional_reps <- additional_reps %>%
    filter(
        map_lgl(state, validate_state),
        map_lgl(district, validate_district)
    )

# Combine both sources and drop rows that are exact duplicates
house_df <- distinct(bind_rows(house_df, additional_reps))

# Governors were written right after they were processed (so they persist even
# if Senate/House processing fails); only the remaining tables are saved here.
inter_outputs <- list(
    "senators_processed.csv" = senators_df,
    "house_processed.csv" = house_df
)
iwalk(inter_outputs, function(df, file_name) {
    write_csv(df, here("data", "inter", file_name))
})

message("[OK] Saved politician data to data/inter/")
